import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from scipy.spatial.distance import pdist
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabaz_score
# Load the raw dataset. The bare expressions below only produce output when
# evaluated interactively (e.g. in a notebook cell).
data = pd.read_csv("data.csv")
data.head(10)
data.shape
data.describe()

import seaborn as sns

# Pairwise scatter plots of the columns; alpha=0.5 is forwarded to the
# underlying plotting calls through plot_kws.
sns.pairplot(data=data, plot_kws=dict(alpha=0.5))
plt.show()
# Pairwise row-to-row distances of `data` under several metrics. Each entry is
# (result key, pdist metric name, extra keyword arguments for pdist).
metric_specs = [
    ('euclidean', 'euclidean', {}),
    ('cityblock', 'cityblock', {}),
    ('minkowski_6', 'minkowski', {'p': 6}),
    ('cosine', 'cosine', {}),
    ('chebyshev', 'chebyshev', {}),
    ('canberra', 'canberra', {}),
]
d = {key: pdist(data, metric, **extra) for key, metric, extra in metric_specs}

# One column per metric, one row per pair of observations.
D = pd.DataFrame(d)
D.shape

# Correlation between the metrics, with rows and columns put in a fixed order
# for display.
corr_order = ['cityblock', 'euclidean', 'minkowski_6', 'chebyshev', 'canberra', 'cosine']
D_corr = D.corr().loc[corr_order, corr_order]
D_corr

sns.heatmap(D_corr, annot=True)
plt.show()
from sklearn import preprocessing

# L2-normalise the data and wrap the resulting array back into a DataFrame
# that carries the original column names.
normalized_values = preprocessing.normalize(data, norm="l2")
data_normalized = pd.DataFrame(normalized_values, columns=data.columns)
data_normalized.head(10)
from sklearn.cluster import DBSCAN

# Grid search for DBSCAN with the Euclidean metric: for every min_samples in
# 3..8 and every eps starting at 0.07 in steps of 0.01 while below 0.1, fit on
# data_normalized and keep the combination with the highest
# Calinski-Harabasz score.
# NOTE: calinski_harabaz_score is the legacy spelling; newer scikit-learn
# releases name it calinski_harabasz_score.
EPS = -1      # best eps found so far
min_smp = -1  # best min_samples found so far
Max = -1      # best score found so far
step = 0.01
for i in range(3, 9, 1):  # i: candidate min_samples
    j = 0.07              # j: candidate eps, restarted for each min_samples
    while j < 0.1:
        cl_dbscan = DBSCAN(eps=j, min_samples=i, metric='euclidean')
        cl_dbscan.fit(data_normalized)
        ME = metrics.calinski_harabaz_score(data_normalized, cl_dbscan.labels_)
        if ME > Max:
            Max = ME
            # j is the eps and i the min_samples; store each under the
            # matching name so the printout below is labelled correctly.
            EPS = j
            min_smp = i
        # Repeated float addition: the exact eps values tried (and whether one
        # close to the upper bound is included) are subject to rounding.
        j += step
print(Max)
print(EPS)
print(min_smp)
# Grid search for DBSCAN with the Euclidean metric: for every min_samples in
# 3..8 and every eps starting at 0.07 in steps of 0.01 while below 0.1, fit on
# data_normalized and keep the combination with the highest silhouette score.
EPS = -1      # best eps found so far
min_smp = -1  # best min_samples found so far
Max = -1      # best score found so far
step = 0.01
for i in range(3, 9, 1):  # i: candidate min_samples
    j = 0.07              # j: candidate eps, restarted for each min_samples
    while j < 0.1:
        cl_dbscan = DBSCAN(eps=j, min_samples=i, metric='euclidean')
        cl_dbscan.fit(data_normalized)
        ME = metrics.silhouette_score(data_normalized, cl_dbscan.labels_)
        if ME > Max:
            Max = ME
            # j is the eps and i the min_samples; store each under the
            # matching name so the printout below is labelled correctly.
            EPS = j
            min_smp = i
        # Repeated float addition: the exact eps values tried (and whether one
        # close to the upper bound is included) are subject to rounding.
        j += step
print(Max)
print(EPS)
print(min_smp)
# Fit DBSCAN (Euclidean, eps=0.09, min_samples=5) on the normalised data.
cl_dbscan = DBSCAN(eps=0.09, min_samples=5, metric='euclidean')
cl_dbscan.fit(data_normalized)
np.shape(cl_dbscan.labels_)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'
sns.pairplot(
    hue='points',
    data=data,
)
plt.show()

# Cluster sizes, then the same pair plot coloured by cluster label.
pd.Series(cl_dbscan.labels_).value_counts()
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
# Fit DBSCAN (Euclidean, eps=0.08, min_samples=7) on the normalised data.
cl_dbscan = DBSCAN(eps=0.08, min_samples=7, metric='euclidean')
cl_dbscan.fit(data_normalized)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'

# Pair plot coloured by cluster label.
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
# Grid search for DBSCAN with the Canberra metric: for every min_samples in
# 3..8 and every eps starting at 0.09 in steps of 0.01 while below 0.19, fit
# on data_normalized and keep the combination with the highest
# Calinski-Harabasz score.
# NOTE: calinski_harabaz_score is the legacy spelling; newer scikit-learn
# releases name it calinski_harabasz_score.
EPS = -1      # best eps found so far
min_smp = -1  # best min_samples found so far
Max = -1      # best score found so far
step = 0.01
for i in range(3, 9, 1):  # i: candidate min_samples
    j = 0.09              # j: candidate eps, restarted for each min_samples
    while j < 0.19:
        cl_dbscan = DBSCAN(eps=j, min_samples=i, metric='canberra')
        cl_dbscan.fit(data_normalized)
        ME = metrics.calinski_harabaz_score(data_normalized, cl_dbscan.labels_)
        if ME > Max:
            Max = ME
            # j is the eps and i the min_samples; store each under the
            # matching name so the printout below is labelled correctly.
            EPS = j
            min_smp = i
        # Repeated float addition: the exact eps values tried (and whether one
        # close to the upper bound is included) are subject to rounding.
        j += step
print(Max)
print(EPS)
print(min_smp)
# Fit DBSCAN (Canberra, eps=0.18, min_samples=6) on the normalised data.
cl_dbscan = DBSCAN(eps=0.18, min_samples=6, metric='canberra')
cl_dbscan.fit(data_normalized)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'

# Pair plot coloured by cluster label.
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
# Grid search for DBSCAN with the Canberra metric: for every min_samples in
# 3..8 and every eps starting at 0.09 in steps of 0.01 while below 0.2, fit on
# data_normalized and keep the combination with the highest silhouette score.
EPS = -1      # best eps found so far
min_smp = -1  # best min_samples found so far
Max = -1      # best score found so far
step = 0.01
for i in range(3, 9, 1):  # i: candidate min_samples
    j = 0.09              # j: candidate eps, restarted for each min_samples
    while j < 0.2:
        cl_dbscan = DBSCAN(eps=j, min_samples=i, metric='canberra')
        cl_dbscan.fit(data_normalized)
        ME = metrics.silhouette_score(data_normalized, cl_dbscan.labels_)
        if ME > Max:
            Max = ME
            # j is the eps and i the min_samples; store each under the
            # matching name so the printout below is labelled correctly.
            EPS = j
            min_smp = i
        # Repeated float addition: the exact eps values tried (and whether one
        # close to the upper bound is included) are subject to rounding.
        j += step
print(Max)
print(EPS)
print(min_smp)
# Fit DBSCAN (Canberra, eps=0.19, min_samples=6) on the normalised data.
cl_dbscan = DBSCAN(eps=0.19, min_samples=6, metric='canberra')
cl_dbscan.fit(data_normalized)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'

# Pair plot coloured by cluster label.
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
# Grid search for DBSCAN with the Chebyshev metric: for every min_samples in
# 3..8 and every eps starting at 0.07 in steps of 0.01 while below 0.1, fit on
# data_normalized and keep the combination with the highest
# Calinski-Harabasz score.
# NOTE: calinski_harabaz_score is the legacy spelling; newer scikit-learn
# releases name it calinski_harabasz_score.
EPS = -1      # best eps found so far
min_smp = -1  # best min_samples found so far
Max = -1      # best score found so far
step = 0.01
for i in range(3, 9, 1):  # i: candidate min_samples
    j = 0.07              # j: candidate eps, restarted for each min_samples
    while j < 0.1:
        cl_dbscan = DBSCAN(eps=j, min_samples=i, metric='chebyshev')
        cl_dbscan.fit(data_normalized)
        ME = metrics.calinski_harabaz_score(data_normalized, cl_dbscan.labels_)
        if ME > Max:
            Max = ME
            # j is the eps and i the min_samples; store each under the
            # matching name so the printout below is labelled correctly.
            EPS = j
            min_smp = i
        # Repeated float addition: the exact eps values tried (and whether one
        # close to the upper bound is included) are subject to rounding.
        j += step
print(Max)
print(EPS)
print(min_smp)
# Fit DBSCAN (Chebyshev, eps=0.08, min_samples=7) on the normalised data.
cl_dbscan = DBSCAN(eps=0.08, min_samples=7, metric='chebyshev')
cl_dbscan.fit(data_normalized)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'

# Pair plot coloured by cluster label.
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
# Grid search for DBSCAN with the Chebyshev metric: for every min_samples in
# 3..8 and every eps starting at 0.09 in steps of 0.01 while below 0.11, fit
# on data_normalized and keep the combination with the highest silhouette
# score.
# NOTE(review): this search previously used metric='canberra', although the
# model fitted right after it is Chebyshev and a Canberra silhouette search
# over a wider eps range already exists earlier in the script. It looks like a
# copy-paste slip, so the metric is 'chebyshev' here; confirm the intent.
EPS = -1      # best eps found so far
min_smp = -1  # best min_samples found so far
Max = -1      # best score found so far
step = 0.01
for i in range(3, 9, 1):  # i: candidate min_samples
    j = 0.09              # j: candidate eps, restarted for each min_samples
    while j < 0.11:
        cl_dbscan = DBSCAN(eps=j, min_samples=i, metric='chebyshev')
        cl_dbscan.fit(data_normalized)
        ME = metrics.silhouette_score(data_normalized, cl_dbscan.labels_)
        if ME > Max:
            Max = ME
            # j is the eps and i the min_samples; store each under the
            # matching name so the printout below is labelled correctly.
            EPS = j
            min_smp = i
        # Repeated float addition: the exact eps values tried (and whether one
        # close to the upper bound is included) are subject to rounding.
        j += step
print(Max)
print(EPS)
print(min_smp)
# Fit DBSCAN (Chebyshev, eps=0.1, min_samples=4) on the normalised data.
cl_dbscan = DBSCAN(eps=0.1, min_samples=4, metric='chebyshev')
cl_dbscan.fit(data_normalized)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'

# Pair plot coloured by cluster label.
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
%%time
# Fit DBSCAN (cosine, eps=0.0055, min_samples=9) on the normalised data.
cl_dbscan = DBSCAN(eps=0.0055, min_samples=9, metric='cosine')
cl_dbscan.fit(data_normalized)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'
sns.pairplot(
    hue='points',
    data=data,
)
plt.show()

# Same pair plot, coloured by cluster label this time.
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
%%time
# Fit DBSCAN (Canberra, eps=0.3, min_samples=8) on the normalised data.
cl_dbscan = DBSCAN(eps=0.3, min_samples=8, metric='canberra')
cl_dbscan.fit(data_normalized)

# Tag every row with its role: 'Reachable' by default, 'Core' for core
# samples, 'Outlier' for rows labelled -1.
data['points'] = 'Reachable'
# Resolve the position of the 'points' column by name instead of hard-coding
# 5, so the assignment stays correct whatever the number of CSV columns.
points_col = data.columns.get_loc('points')
data.iloc[cl_dbscan.core_sample_indices_, points_col] = 'Core'
data.loc[cl_dbscan.labels_ == -1, 'points'] = 'Outlier'
sns.pairplot(
    hue='points',
    data=data,
)
plt.show()

# Same pair plot, coloured by cluster label this time.
data['cl_dbscan'] = cl_dbscan.labels_
sns.pairplot(
    hue='cl_dbscan',
    data=data,
)
plt.show()
# Silhouette score of the Canberra DBSCAN model (eps=0.3, min_samples=8)
# fitted on the normalised data; the trailing bare name displays the value
# when run interactively.
cl_dbscan = DBSCAN(metric='canberra', min_samples=8, eps=0.3).fit(data_normalized)
ME = silhouette_score(data_normalized, cl_dbscan.labels_)
ME
# Reload the CSV so the helper columns added to `data` earlier ('points',
# 'cl_dbscan') are not part of the input, normalise it, and score the
# Canberra DBSCAN model (eps=0.3, min_samples=8) with the Calinski-Harabasz
# index. The trailing bare name displays the value when run interactively.
data = pd.read_csv("data.csv")
dataNorm = preprocessing.normalize(data)
cl_dbscan = DBSCAN(metric='canberra', min_samples=8, eps=0.3).fit(dataNorm)
ME = calinski_harabaz_score(dataNorm, cl_dbscan.labels_)
ME